package lx;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small scraping demo: downloads a Baidu Zhidao page that lists Chinese
 * surnames and prints every substring that matches {@link #SURNAME_PATTERN}.
 */
public class dmo3 {

    /** Page that is downloaded and scanned for surnames. */
    private static final String SOURCE_URL = "https://zhidao.baidu.com/question/143118960.html";

    /**
     * One or two characters in the range U+4E00..U+9FA5, excluding '氏',
     * immediately followed by a single whitespace character. The trailing
     * whitespace is part of the match and is therefore printed as well.
     */
    private static final Pattern SURNAME_PATTERN =
            Pattern.compile("[\\u4e00-\\u9fa5&&[^氏]]{1,2}(\\s){1}");

    /**
     * Fetches {@link #SOURCE_URL}, reads it line by line and prints each regex
     * match on its own line to standard output.
     *
     * @param args command-line arguments (unused)
     * @throws IOException if the page cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        URL url = new URL(SOURCE_URL);
        URLConnection connection = url.openConnection();

        // The connection only exposes a byte stream; the page contains Chinese
        // text, so it has to be decoded with the right charset rather than the
        // platform default, which differs between machines and JVM versions.
        Charset charset = responseCharset(connection);

        // try-with-resources closes the reader (and the underlying stream)
        // even when reading fails part-way through.
        try (BufferedReader reader =
                     new BufferedReader(new InputStreamReader(connection.getInputStream(), charset))) {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher matcher = SURNAME_PATTERN.matcher(line);
                while (matcher.find()) {
                    System.out.println(matcher.group());
                }
            }
        }
    }

    /**
     * Determines the charset to decode the response body with.
     *
     * @param connection the connection whose Content-Type header is inspected
     * @return the charset named by the header's {@code charset} parameter, or
     *         UTF-8 when the header is absent, has no such parameter, or names
     *         a charset this JVM does not recognise
     */
    private static Charset responseCharset(URLConnection connection) {
        final String key = "charset=";
        String contentType = connection.getContentType();
        if (contentType != null) {
            for (String part : contentType.split(";")) {
                String param = part.trim();
                if (param.regionMatches(true, 0, key, 0, key.length())) {
                    String name = param.substring(key.length()).trim().replace("\"", "");
                    try {
                        return Charset.forName(name);
                    } catch (IllegalArgumentException ignored) {
                        // Malformed or unsupported charset name: fall back to the default below.
                    }
                }
            }
        }
        return StandardCharsets.UTF_8;
    }
}
